---
title: "EDA"
output:
  html_document: default
  word_document: default
date: "2023-11-21"
---

library(tidyverse)
library(lubridate)
library(ggplot2)
library(cowplot)
library(dplyr)
library(DT)
library(GGally)
# Load the two input tables from CSV files in the working directory.
data_monthly <- read.csv(file = "data_monthly.csv")
maindata <- read.csv(file = "maindata.csv")

longtype

# Monthly date sequence from January 1990 to November 2023 (first day of each
# month).
dates <- seq(ymd("1990-01-01"), ymd("2023-11-01"), by = "1 month")

# Attach the dates to maindata by position.
# NOTE(review): this assumes the rows of maindata are already in chronological
# order, one row per month -- confirm against maindata.csv.
# Fail early with a readable message if the row count does not match.
stopifnot(
  "maindata must have one row per month from Jan. 1990 to Nov. 2023" =
    nrow(maindata) == length(dates)
)
maindata <- maindata %>%
  mutate(date = dates)

# Reshape to long format (one row per date/variable pair) for ggplot2.
# No column is removed explicitly; rows with a missing value are dropped, which
# also removes any variable that is missing throughout (presumably 'uvindex'
# and 'solarradiation' -- verify). is.na() is used rather than is.nan()
# because is.nan() is FALSE for NA, so NA cells would survive the filter.
# pivot_longer() requires the non-date columns to share a common (numeric)
# type.
maindata_long <- maindata %>%
  pivot_longer(cols = -date, names_to = "variable", values_to = "value") %>%
  filter(!is.na(value))

Plot the annual mean temperature

# Annual mean temperature.
# Keep only the "temp" series from the long table and tag each row with its
# calendar year.
maindata_temp <- maindata_long %>%
  filter(variable == "temp") %>%
  mutate(year = year(date))

# Average the observations within each year. `variable` is part of the
# grouping so that the column is carried through to the result.
# 2023 only runs through November, so its mean covers a partial year.
annual_mean_temp <- maindata_temp %>%
  group_by(year, variable) %>%
  summarise(year_mean = mean(value, na.rm = TRUE), .groups = "drop")

# Points joined by a line with a loess smoother on top; title and subtitle are
# centred.
annual_mean_temp_plot <- annual_mean_temp %>%
  ggplot(aes(x = year, y = year_mean)) +
  geom_point() +
  geom_line() +
  geom_smooth(method = "loess") +
  theme(
    axis.title.x = element_blank(),
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5)
  ) +
  labs(
    title = "Annual Mean Temperature",
    subtitle = "Data from Jan. 1990 to Nov. 2023",
    y = "Degrees Celsius"
  )

annual_mean_temp_plot

Plot the monthly mean temperature

# Monthly mean temperature, one facet per calendar month.

# Temperature rows only, tagged with calendar year and month label.
maindata_temp <- maindata_long |>
  filter(variable == "temp") |>
  mutate(year = year(date), month = month(date, label = TRUE))

# Mean per (year, month) pair.
monthly_mean_temp <- maindata_temp |>
  group_by(year, month) |>
  summarise(month_mean = mean(value, na.rm = TRUE)) |>
  ungroup()

# Twelve hues from the default ggplot2 palette, one per month.
my_colour <- scales::hue_pal()(12)

# Small points plus a loess smoother per month; the legend is hidden because
# the facet strips already name the month.
temp_plot <- monthly_mean_temp |>
  ggplot(mapping = aes(x = year, y = month_mean, colour = month)) +
  geom_point(size = 0.5) +
  geom_smooth(method = "loess") +
  facet_wrap(~month) +
  scale_color_manual(values = my_colour) +
  labs(
    title = "Monthly Mean Temperature",
    subtitle = "Data from Jan. 1990 to Nov. 2023",
    y = "Degrees Celsius"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10)
  )

temp_plot

Plot the annual mean humidity

# Annual mean humidity.
# Keep only the "humidity" series from the long table and tag each row with
# its calendar year.
maindata_humidity <- maindata_long %>%
  filter(variable == "humidity") %>%
  mutate(year = year(date))

# Average the observations within each year. `variable` is part of the
# grouping so that the column is carried through to the result.
# 2023 only runs through November, so its mean covers a partial year.
annual_mean_humidity <- maindata_humidity %>%
  group_by(year, variable) %>%
  summarise(year_mean = mean(value, na.rm = TRUE), .groups = "drop")

# Points joined by a line with a loess smoother on top; title and subtitle are
# centred.
annual_mean_humidity_plot <- annual_mean_humidity %>%
  ggplot(aes(x = year, y = year_mean)) +
  geom_point() +
  geom_line() +
  geom_smooth(method = "loess") +
  theme(
    axis.title.x = element_blank(),
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5)
  ) +
  labs(
    title = "Annual Mean Humidity",
    subtitle = "Data from Jan. 1990 to Nov. 2023",
    y = "%"
  )

annual_mean_humidity_plot

Plot the monthly mean humidity

# Monthly mean humidity, one facet per calendar month.

# Humidity rows only, tagged with calendar year and month label.
maindata_humidity <- maindata_long %>%
  filter(variable == "humidity") %>%
  mutate(year = year(date), month = month(date, label = TRUE))

# Mean humidity per (year, month) pair.
monthly_mean_humidity <- maindata_humidity %>%
  group_by(year, month) %>%
  summarise(month_mean = mean(value, na.rm = TRUE), .groups = "drop")

# Twelve hues from the default ggplot2 palette, one per month.
my_colour <- scales::hue_pal()(12)

# Small points plus a loess smoother per month; the legend is hidden because
# the facet strips already name the month.
humidity_plot <- ggplot(
  monthly_mean_humidity,
  aes(x = year, y = month_mean, colour = month)
) +
  geom_point(size = 0.5) +
  geom_smooth(method = "loess") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    axis.title.x = element_blank(),
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5)
  ) +
  scale_color_manual(values = my_colour) +
  labs(
    title = "Monthly Mean humidity",
    subtitle = "Data from Jan. 1990 to Nov. 2023",
    # Humidity is a percentage (same unit as the annual humidity plot), not a
    # temperature.
    y = "%"
  ) +
  facet_wrap(~month)

humidity_plot

Plot the annual mean windspeed

# Annual mean wind speed.
# Keep only the "windspeed" series from the long table and tag each row with
# its calendar year.
maindata_windspeed <- maindata_long %>%
  filter(variable == "windspeed") %>%
  mutate(year = year(date))

# Average the observations within each year. `variable` is part of the
# grouping so that the column is carried through to the result.
# 2023 only runs through November, so its mean covers a partial year.
annual_mean_windspeed <- maindata_windspeed %>%
  group_by(year, variable) %>%
  summarise(year_mean = mean(value, na.rm = TRUE), .groups = "drop")

# Points joined by a line with a loess smoother on top; title and subtitle are
# centred.
annual_mean_windspeed_plot <- annual_mean_windspeed %>%
  ggplot(aes(x = year, y = year_mean)) +
  geom_point() +
  geom_line() +
  geom_smooth(method = "loess") +
  theme(
    axis.title.x = element_blank(),
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5)
  ) +
  labs(
    title = "Annual Mean Wind Speed",
    subtitle = "Data from Jan. 1990 to Nov. 2023",
    y = "Kilometers per hour (kph) "
  )

annual_mean_windspeed_plot

Plot the monthly mean windspeed

# Monthly mean wind speed, one facet per calendar month.

# Wind speed rows only, tagged with calendar year and month label.
maindata_windspeed <- maindata_long |>
  filter(variable == "windspeed") |>
  mutate(year = year(date), month = month(date, label = TRUE))

# Mean wind speed per (year, month) pair.
monthly_mean_windspeed <- maindata_windspeed |>
  group_by(year, month) |>
  summarise(month_mean = mean(value, na.rm = TRUE)) |>
  ungroup()

# Twelve hues from the default ggplot2 palette, one per month.
my_colour <- scales::hue_pal()(12)

# Small points plus a loess smoother per month; the legend is hidden because
# the facet strips already name the month.
windspeed_plot <- monthly_mean_windspeed |>
  ggplot(mapping = aes(x = year, y = month_mean, colour = month)) +
  geom_point(size = 0.5) +
  geom_smooth(method = "loess") +
  facet_wrap(~month) +
  scale_color_manual(values = my_colour) +
  labs(
    title = "Monthly Mean Wind Speed",
    subtitle = "Data from Jan. 1990 to Nov. 2023",
    y = "Kilometers per hour (kph) "
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10)
  )

windspeed_plot

Plot the annual mean cloudcover

# Annual mean cloud cover.
# Keep only the "cloudcover" series from the long table and tag each row with
# its calendar year.
maindata_cloudcover <- maindata_long %>%
  filter(variable == "cloudcover") %>%
  mutate(year = year(date))

# Average the observations within each year. `variable` is part of the
# grouping so that the column is carried through to the result.
# 2023 only runs through November, so its mean covers a partial year.
annual_mean_cloudcover <- maindata_cloudcover %>%
  group_by(year, variable) %>%
  summarise(year_mean = mean(value, na.rm = TRUE), .groups = "drop")

# Points joined by a line with a loess smoother on top; title and subtitle are
# centred.
annual_mean_cloudcover_plot <- annual_mean_cloudcover %>%
  ggplot(aes(x = year, y = year_mean)) +
  geom_point() +
  geom_line() +
  geom_smooth(method = "loess") +
  theme(
    axis.title.x = element_blank(),
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5)
  ) +
  labs(
    title = "Annual Mean Cloud Cover",
    subtitle = "Data from Jan. 1990 to Nov. 2023",
    y = "% "
  )

annual_mean_cloudcover_plot

Plot the monthly mean cloudcover

# Monthly mean cloud cover, one facet per calendar month.

# Cloud cover rows only, tagged with calendar year and month label.
maindata_cloudcover <- maindata_long |>
  filter(variable == "cloudcover") |>
  mutate(year = year(date), month = month(date, label = TRUE))

# Mean cloud cover per (year, month) pair.
monthly_mean_cloudcover <- maindata_cloudcover |>
  group_by(year, month) |>
  summarise(month_mean = mean(value, na.rm = TRUE)) |>
  ungroup()

# Twelve hues from the default ggplot2 palette, one per month.
my_colour <- scales::hue_pal()(12)

# Small points plus a loess smoother per month; the legend is hidden because
# the facet strips already name the month.
cloudcover_plot <- monthly_mean_cloudcover |>
  ggplot(mapping = aes(x = year, y = month_mean, colour = month)) +
  geom_point(size = 0.5) +
  geom_smooth(method = "loess") +
  facet_wrap(~month) +
  scale_color_manual(values = my_colour) +
  labs(
    title = "Monthly Mean Cloud Cover",
    subtitle = "Data from 1990 - 2023",
    y = "%"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10)
  )

cloudcover_plot

Plot the annual mean sealevelpressure

# Annual mean sea level pressure.
# Keep only the "sealevelpressure" series from the long table and tag each row
# with its calendar year.
maindata_sealevelpressure <- maindata_long %>%
  filter(variable == "sealevelpressure") %>%
  mutate(year = year(date))

# Average the observations within each year. `variable` is part of the
# grouping so that the column is carried through to the result.
# 2023 only runs through November, so its mean covers a partial year.
annual_mean_sealevelpressure <- maindata_sealevelpressure %>%
  group_by(year, variable) %>%
  summarise(year_mean = mean(value, na.rm = TRUE), .groups = "drop")

# Points joined by a line with a loess smoother on top; title and subtitle are
# centred.
annual_mean_sealevelpressure_plot <- annual_mean_sealevelpressure %>%
  ggplot(aes(x = year, y = year_mean)) +
  geom_point() +
  geom_line() +
  geom_smooth(method = "loess") +
  theme(
    axis.title.x = element_blank(),
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5)
  ) +
  labs(
    title = "Annual Mean Sea Level Pressure",
    subtitle = "Data from Jan. 1990 to Nov. 2023",
    y = "Millibars (mb)"
  )

annual_mean_sealevelpressure_plot

Plot the monthly mean sealevelpressure

# Monthly mean sea level pressure, one facet per calendar month.

# Sea level pressure rows only, tagged with calendar year and month label.
maindata_sealevelpressure <- maindata_long |>
  filter(variable == "sealevelpressure") |>
  mutate(year = year(date), month = month(date, label = TRUE))

# Mean sea level pressure per (year, month) pair.
monthly_mean_sealevelpressure <- maindata_sealevelpressure |>
  group_by(year, month) |>
  summarise(month_mean = mean(value, na.rm = TRUE)) |>
  ungroup()

# Twelve hues from the default ggplot2 palette, one per month.
my_colour <- scales::hue_pal()(12)

# Small points plus a loess smoother per month; the legend is hidden because
# the facet strips already name the month.
sealevelpressure_plot <- monthly_mean_sealevelpressure |>
  ggplot(mapping = aes(x = year, y = month_mean, colour = month)) +
  geom_point(size = 0.5) +
  geom_smooth(method = "loess") +
  facet_wrap(~month) +
  scale_color_manual(values = my_colour) +
  labs(
    title = "Monthly Mean Sea Level Pressure",
    subtitle = "Data from Jan. 1990 to Nov. 2023",
    y = "Millibars (mb)"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10)
  )

sealevelpressure_plot

Plot the annual mean precip

# Annual mean precipitation.
# Keep only the "precip" series from the long table and tag each row with its
# calendar year.
maindata_precip <- maindata_long %>%
  filter(variable == "precip") %>%
  mutate(year = year(date))

# Average the observations within each year. `variable` is part of the
# grouping so that the column is carried through to the result.
# 2023 only runs through November, so its mean covers a partial year.
annual_mean_precip <- maindata_precip %>%
  group_by(year, variable) %>%
  summarise(year_mean = mean(value, na.rm = TRUE), .groups = "drop")

# Points joined by a line with a loess smoother on top; title and subtitle are
# centred.
annual_mean_precip_plot <- annual_mean_precip %>%
  ggplot(aes(x = year, y = year_mean)) +
  geom_point() +
  geom_line() +
  geom_smooth(method = "loess") +
  theme(
    axis.title.x = element_blank(),
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5)
  ) +
  labs(
    title = "Annual Mean Precipitation",
    subtitle = "Data from Jan. 1990 to Nov. 2023",
    y = "mm"
  )

annual_mean_precip_plot

Plot the monthly mean precip

# Monthly mean precipitation, one facet per calendar month.

# Precipitation rows only, tagged with calendar year and month label.
maindata_precip <- maindata_long |>
  filter(variable == "precip") |>
  mutate(year = year(date), month = month(date, label = TRUE))

# Mean precipitation per (year, month) pair.
monthly_mean_precip <- maindata_precip |>
  group_by(year, month) |>
  summarise(month_mean = mean(value, na.rm = TRUE)) |>
  ungroup()

# Twelve hues from the default ggplot2 palette, one per month.
my_colour <- scales::hue_pal()(12)

# Small points plus a loess smoother per month; the legend is hidden because
# the facet strips already name the month.
precip_plot <- monthly_mean_precip |>
  ggplot(mapping = aes(x = year, y = month_mean, colour = month)) +
  geom_point(size = 0.5) +
  geom_smooth(method = "loess") +
  facet_wrap(~month) +
  scale_color_manual(values = my_colour) +
  labs(
    title = "Monthly Precipitation",
    subtitle = "Data from Jan. 1990 to Nov. 2023",
    y = "mm"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10)
  )

precip_plot

Trend of each feature_v2

# Temperature against month from the pre-aggregated monthly table, coloured by
# year, with jitter to separate overlapping points.
# NOTE(review): assumes data_monthly has numeric `month`, `temp` and `year`
# columns -- confirm against data_monthly.csv.
data_monthly |>
  ggplot(aes(x = month, y = temp, col = year)) +
  geom_jitter() +
  # "smooth" is not a model-fitting method geom_smooth() can use, so no trend
  # line would be drawn; use loess like the other trend plots in this file.
  geom_smooth(method = "loess") +
  theme_cowplot() +
  xlab("Month") +
  ylab("Temperature")

Boxplot: temp

# Temperature rows with the calendar month as a factor for the x axis.
temp_data <- maindata_long |>
  filter(variable == "temp") |>
  mutate(month = factor(month(date, label = TRUE)))

# One box per calendar month, pooling all years.
temp_boxplot <- temp_data |>
  ggplot(mapping = aes(x = month, y = value)) +
  geom_boxplot() +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  labs(
    title = "Monthly Boxplots of Temperature",
    subtitle = "Data from Jan. 1990 to Nov. 2023",
    x = "Month",
    y = "Degrees Celsius"
  )

print(temp_boxplot)

## Boxplot: temp with outlier

# Temperature rows with month as a factor and year as text (used as the label
# for outlying points).
temp_data <- maindata_long %>%
  filter(variable == "temp") %>%
  mutate(year = as.character(year(date)),
         month = factor(month(date, label = TRUE)))

# Per-month fences at 1.5 * IQR beyond the quartiles; rows outside the fences
# are kept as outliers. na.rm = TRUE keeps quantile()/IQR() from failing on
# missing values, matching the precipitation version of this computation.
outliers <- temp_data %>%
  group_by(month) %>%
  summarise(
    lower = quantile(value, 0.25, na.rm = TRUE) - 1.5 * IQR(value, na.rm = TRUE),
    upper = quantile(value, 0.75, na.rm = TRUE) + 1.5 * IQR(value, na.rm = TRUE)
  ) %>%
  left_join(temp_data, by = "month") %>%
  filter(value < lower | value > upper)

# Monthly boxplots with each outlier annotated by its year.
temp_boxplot_outlier <- ggplot(temp_data, aes(x = month, y = value)) +
  geom_boxplot() +
  geom_text(data = outliers, aes(label = year), vjust = -0.5) +
  labs(title = "Monthly Boxplots of Temperature",
       subtitle = "Data from Jan. 1990 to Nov. 2023",
       x = "Month",
       y = "Degrees Celsius") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5))

print(temp_boxplot_outlier)

Boxplot: humidity

# Humidity rows with the calendar month as a factor for the x axis.
humidity_data <- maindata_long %>%
  filter(variable == "humidity") %>%
  mutate(month = factor(month(date, label = TRUE)))

# One box per calendar month, pooling all years. The plot is built from
# humidity_data so that the boxes match the title and the "%" axis.
humidity_boxplot <- ggplot(humidity_data, aes(x = month, y = value)) +
  geom_boxplot() +
  labs(title = "Monthly Boxplots of Humidity",
       subtitle = "Data from Jan. 1990 to Nov. 2023",
       x = "Month",
       y = "%") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5))

print(humidity_boxplot)

## Boxplot: humidity with outlier

# Humidity rows with month as a factor and year as text (used as the label for
# outlying points).
humidity_data <- maindata_long %>%
  filter(variable == "humidity") %>%
  mutate(year = as.character(year(date)),
         month = factor(month(date, label = TRUE)))

# Per-month fences at 1.5 * IQR beyond the quartiles; rows outside the fences
# are kept as outliers. na.rm = TRUE keeps quantile()/IQR() from failing on
# missing values, matching the precipitation version of this computation.
outliers <- humidity_data %>%
  group_by(month) %>%
  summarise(
    lower = quantile(value, 0.25, na.rm = TRUE) - 1.5 * IQR(value, na.rm = TRUE),
    upper = quantile(value, 0.75, na.rm = TRUE) + 1.5 * IQR(value, na.rm = TRUE)
  ) %>%
  left_join(humidity_data, by = "month") %>%
  filter(value < lower | value > upper)

# Monthly boxplots with each outlier annotated by its year.
humidity_boxplot_outlier <- ggplot(humidity_data, aes(x = month, y = value)) +
  geom_boxplot() +
  geom_text(data = outliers, aes(label = year), vjust = -0.5) +
  labs(title = "Monthly Boxplots of Humidity",
       subtitle = "Data from Jan. 1990 to Nov. 2023",
       x = "Month",
       y = "%") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5))

print(humidity_boxplot_outlier)

## Boxplot: windspeed

# Wind speed rows with the calendar month as a factor for the x axis.
windspeed_data <- maindata_long %>%
  filter(variable == "windspeed") %>%
  mutate(month = factor(month(date, label = TRUE)))

# One box per calendar month, pooling all years. The plot is built from
# windspeed_data so that the boxes match the title and the kph axis.
windspeed_boxplot <- ggplot(windspeed_data, aes(x = month, y = value)) +
  geom_boxplot() +
  labs(title = "Monthly Boxplots of Wind Speed",
       subtitle = "Data from Jan. 1990 to Nov. 2023",
       x = "Month",
       y = "Kilometers per hour (kph)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5))

print(windspeed_boxplot)

Boxplot: windspeed with outlier

# Wind speed rows with month as a factor and year as text (used as the label
# for outlying points).
windspeed_data <- maindata_long %>%
  filter(variable == "windspeed") %>%
  mutate(year = as.character(year(date)),
         month = factor(month(date, label = TRUE)))

# Per-month fences at 1.5 * IQR beyond the quartiles; rows outside the fences
# are kept as outliers. na.rm = TRUE keeps quantile()/IQR() from failing on
# missing values, matching the precipitation version of this computation.
outliers <- windspeed_data %>%
  group_by(month) %>%
  summarise(
    lower = quantile(value, 0.25, na.rm = TRUE) - 1.5 * IQR(value, na.rm = TRUE),
    upper = quantile(value, 0.75, na.rm = TRUE) + 1.5 * IQR(value, na.rm = TRUE)
  ) %>%
  left_join(windspeed_data, by = "month") %>%
  filter(value < lower | value > upper)

# Monthly boxplots with each outlier annotated by its year.
windspeed_boxplot_outlier <- ggplot(windspeed_data, aes(x = month, y = value)) +
  geom_boxplot() +
  geom_text(data = outliers, aes(label = year), vjust = -0.5) +
  labs(title = "Monthly Boxplots of Wind Speed",
       subtitle = "Data from Jan. 1990 to Nov. 2023",
       x = "Month",
       y = "Kilometers per hour (kph)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5))

print(windspeed_boxplot_outlier)

Boxplot: cloudcover

# Cloud cover rows with the calendar month as a factor for the x axis.
cloudcover_data <- maindata_long %>%
  filter(variable == "cloudcover") %>%
  mutate(month = factor(month(date, label = TRUE)))

# One box per calendar month, pooling all years. The plot is built from
# cloudcover_data so that the boxes match the title and the "%" axis.
cloudcover_boxplot <- ggplot(cloudcover_data, aes(x = month, y = value)) +
  geom_boxplot() +
  labs(title = "Monthly Boxplots of Cloud Cover",
       subtitle = "Data from Jan. 1990 to Nov. 2023",
       x = "Month",
       y = "%") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5))

print(cloudcover_boxplot)

Boxplot: cloudcover with outlier

# Cloud cover rows with month as a factor and year as text (used as the label
# for outlying points).
cloudcover_data <- maindata_long %>%
  filter(variable == "cloudcover") %>%
  mutate(year = as.character(year(date)),
         month = factor(month(date, label = TRUE)))

# Per-month fences at 1.5 * IQR beyond the quartiles; rows outside the fences
# are kept as outliers. na.rm = TRUE keeps quantile()/IQR() from failing on
# missing values, matching the precipitation version of this computation.
outliers <- cloudcover_data %>%
  group_by(month) %>%
  summarise(
    lower = quantile(value, 0.25, na.rm = TRUE) - 1.5 * IQR(value, na.rm = TRUE),
    upper = quantile(value, 0.75, na.rm = TRUE) + 1.5 * IQR(value, na.rm = TRUE)
  ) %>%
  left_join(cloudcover_data, by = "month") %>%
  filter(value < lower | value > upper)

# Monthly boxplots with each outlier annotated by its year.
cloudcover_boxplot_outlier <- ggplot(cloudcover_data, aes(x = month, y = value)) +
  geom_boxplot() +
  geom_text(data = outliers, aes(label = year), vjust = -0.5) +
  labs(title = "Monthly Boxplots of Cloud Cover",
       subtitle = "Data from Jan. 1990 to Nov. 2023",
       x = "Month",
       y = "%") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5))

print(cloudcover_boxplot_outlier)

Boxplot: sealevelpressure

# Sea level pressure rows with the calendar month as a factor for the x axis.
sealevelpressure_data <- maindata_long %>%
  filter(variable == "sealevelpressure") %>%
  mutate(month = factor(month(date, label = TRUE)))

# One box per calendar month, pooling all years. The plot is built from
# sealevelpressure_data so that the boxes match the title and the mb axis.
sealevelpressure_boxplot <- ggplot(
  sealevelpressure_data,
  aes(x = month, y = value)
) +
  geom_boxplot() +
  labs(title = "Monthly Boxplots of Sea Level Pressure",
       subtitle = "Data from Jan. 1990 to Nov. 2023",
       x = "Month",
       y = "Millibars (mb)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5))

print(sealevelpressure_boxplot)

Boxplot: sealevelpressure with outlier

# Sea level pressure rows with month as a factor and year as text (used as the
# label for outlying points).
sealevelpressure_data <- maindata_long %>%
  filter(variable == "sealevelpressure") %>%
  mutate(year = as.character(year(date)),
         month = factor(month(date, label = TRUE)))

# Per-month fences at 1.5 * IQR beyond the quartiles; rows outside the fences
# are kept as outliers. na.rm = TRUE keeps quantile()/IQR() from failing on
# missing values, matching the precipitation version of this computation.
outliers <- sealevelpressure_data %>%
  group_by(month) %>%
  summarise(
    lower = quantile(value, 0.25, na.rm = TRUE) - 1.5 * IQR(value, na.rm = TRUE),
    upper = quantile(value, 0.75, na.rm = TRUE) + 1.5 * IQR(value, na.rm = TRUE)
  ) %>%
  left_join(sealevelpressure_data, by = "month") %>%
  filter(value < lower | value > upper)

# Monthly boxplots with each outlier annotated by its year.
sealevelpressure_boxplot_outlier <- ggplot(
  sealevelpressure_data,
  aes(x = month, y = value)
) +
  geom_boxplot() +
  geom_text(data = outliers, aes(label = year), vjust = -0.5) +
  labs(title = "Monthly Boxplots of Sea Level Pressure",
       subtitle = "Data from Jan. 1990 to Nov. 2023",
       x = "Month",
       y = "Millibars (mb)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5))

print(sealevelpressure_boxplot_outlier)

Boxplot: precip

# Precipitation rows with the calendar month as a factor for the x axis.
precip_data <- maindata_long %>%
  filter(variable == "precip") %>%
  mutate(month = factor(month(date, label = TRUE)))

# One box per calendar month, pooling all years. The plot is built from
# precip_data so that the boxes match the title and the mm axis.
precip_boxplot <- ggplot(precip_data, aes(x = month, y = value)) +
  geom_boxplot() +
  labs(title = "Monthly Boxplots of Precipitation",
       subtitle = "Data from Jan. 1990 to Nov. 2023",
       x = "Month",
       y = "mm") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5))

print(precip_boxplot)

Boxplot: precip with outlier

# Precipitation rows with month as a factor and year as text (used as the
# label for outlying points).
precip_data <- maindata_long |>
  filter(variable == "precip") |>
  mutate(
    year = as.character(year(date)),
    month = factor(month(date, label = TRUE))
  )

# Per-month fences at 1.5 * IQR beyond the quartiles (missing values ignored);
# rows falling outside the fences are kept as outliers.
outliers <- precip_data |>
  group_by(month) |>
  summarise(
    lower = quantile(value, 0.25, na.rm = TRUE) - 1.5 * IQR(value, na.rm = TRUE),
    upper = quantile(value, 0.75, na.rm = TRUE) + 1.5 * IQR(value, na.rm = TRUE)
  ) |>
  left_join(precip_data, by = "month") |>
  filter(value < lower | value > upper)

# Monthly boxplots with each outlier annotated by its year.
precip_boxplot_outlier <- precip_data |>
  ggplot(mapping = aes(x = month, y = value)) +
  geom_boxplot() +
  geom_text(data = outliers, aes(label = year), vjust = -0.5) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  labs(
    title = "Monthly Boxplots of Precipitation",
    subtitle = "Data from Jan. 1990 to Nov. 2023",
    x = "Month",
    y = "mm"
  )

print(precip_boxplot_outlier)

Conditions: ‘Clear’, ‘Overcast’, ‘Partially cloudy’, ‘Rain, Partially cloudy’, ‘Rain, Overcast’, ‘Rain’

# Annual mean of each weather-condition series, one facet per condition.

# Keep the six condition series (named as they appear in the long table's
# `variable` column) and tag each row with its calendar year.
maindata_condition <- maindata_long |>
  filter(
    variable %in% c(
      "Clear", "Overcast", "Partially.cloudy",
      "Rain..Partially.cloudy", "Rain..Overcast", "Rain"
    )
  ) |>
  mutate(year = year(date))

# Mean value per (year, condition) pair.
annual_mean_condition <- maindata_condition |>
  group_by(year, variable) |>
  summarise(year_mean = mean(value, na.rm = TRUE)) |>
  ungroup()

# Points joined by a line with a loess smoother on top, per condition; title
# and subtitle are centred.
annual_mean_condition_plot <- annual_mean_condition |>
  ggplot(mapping = aes(x = year, y = year_mean)) +
  geom_point() +
  geom_line() +
  geom_smooth(method = "loess") +
  facet_wrap(~variable) +
  labs(
    title = "Annual Mean Frequency of Weather Condition",
    subtitle = "Data from Jan. 1990 to Nov. 2023",
    y = "Count"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "none",
    axis.title.x = element_blank()
  )

annual_mean_condition_plot

# Monthly mean of each weather-condition series, laid out as a grid with one
# row per condition and one column per calendar month.

# Keep the six condition series and tag each row with year and month label.
maindata_condition <- maindata_long |>
  filter(
    variable %in% c(
      "Clear", "Overcast", "Partially.cloudy",
      "Rain..Partially.cloudy", "Rain..Overcast", "Rain"
    )
  ) |>
  mutate(year = year(date), month = month(date, label = TRUE))

# Mean value per (condition, year, month) combination.
monthly_mean_condition <- maindata_condition |>
  group_by(variable, year, month) |>
  summarise(month_mean = mean(value, na.rm = TRUE)) |>
  ungroup()

# Twelve hues from the default ggplot2 palette, one per month.
my_colour <- scales::hue_pal()(12)

# Small points plus a loess smoother in every panel; the legend is hidden
# because the facet strips already name the month.
condition_plot <- monthly_mean_condition |>
  ggplot(mapping = aes(x = year, y = month_mean, colour = month)) +
  geom_point(size = 0.5) +
  geom_smooth(method = "loess") +
  facet_grid(variable~month) +
  scale_color_manual(values = my_colour) +
  labs(
    title = "Monthly Frequency of Weather Condition",
    subtitle = "Data from Jan. 1990 to Nov. 2023",
    y = "Count"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10)
  )

condition_plot

Yearly Basic Statistics Table (excluding the condition variables)

Features: temp, humidity, windspeed, cloudcover, sealevelpressure, precip

If you want to summarize the data by month instead of by year, you can add ‘group_by(year, month)’

library(DT)
# Add calendar year and month label columns. data_summary is reused by the
# weather-condition summary table and by the correlation analysis.
data_summary <- maindata %>%
  mutate(year = year(date), month = month(date, label = TRUE))

# Per-year minimum, maximum and mean of each continuous feature, rounded to
# three decimals. Result columns are named <feature>_<statistic>.
datatable1 <- data_summary %>%
  select(
    year, temp, humidity, windspeed, cloudcover, sealevelpressure, precip
  ) %>%
  group_by(year) %>%
  summarise(across(
    everything(),
    list(
      min = ~ min(.x, na.rm = TRUE),
      max = ~ max(.x, na.rm = TRUE),
      mean = ~ mean(.x, na.rm = TRUE)
    )
  )) %>%
  mutate(across(everything(), ~ round(.x, 3)))

# Interactive table with horizontal scrolling for the wide column set.
datatable(datatable1, options = list(autoWidth = FALSE, scrollX = TRUE))

Yearly Basic Statistics Table (only the condition variables)

If you want to summarize the data by month instead of by year, you can add ‘group_by(year, month)’

# Per-year minimum, maximum and mean of each weather-condition column.
# The condition columns are taken as the contiguous range Clear:Rain
# (NOTE(review): relies on the column order of maindata.csv -- confirm).
# Result columns are named <condition>_<statistic> and come out grouped per
# condition (Clear_min, Clear_max, Clear_mean, ...), as in datatable1.
datatable2 <- data_summary %>%
  select(year, Clear:Rain) %>%
  group_by(year) %>%
  summarise(across(
    Clear:Rain,
    list(
      min = ~ min(.x, na.rm = TRUE),
      max = ~ max(.x, na.rm = TRUE),
      mean = ~ mean(.x, na.rm = TRUE)
    )
  )) %>%
  # min()/max() return +/-Inf (and mean() NaN) when a year has no non-missing
  # values; show those cells as NA instead.
  mutate(across(everything(), ~ ifelse(is.finite(.x), .x, NA))) %>%
  mutate(across(everything(), ~ round(.x, 3)))

# Interactive table with horizontal scrolling for the wide column set.
datatable(datatable2, options = list(autoWidth = FALSE, scrollX = TRUE))

Yearly Basic Statistics Table

If you want to summarize the data by month instead of by year, you can add ‘group_by(year, month)’

# Combine the continuous-feature and weather-condition summaries, matching
# rows on year and keeping every year present in datatable1.
final_table <- datatable1 |>
  left_join(datatable2, by = "year")

# Show the combined table; horizontal scrolling handles the wide column set.
datatable(final_table, options = list(autoWidth = FALSE, scrollX = TRUE))

#Excluding the variables ‘Clear’, ‘Overcast’, ‘Partially cloudy’, ‘Rain, Partially cloudy’, ‘Rain, Overcast’, ‘Rain’, as they make the data look too messy. Should I add them back? # If you want to summarize the data by month instead of by year, you can add ‘group_by(year, month)’

# Correlation analysis of the continuous features only; the weather-condition
# columns ('Clear', 'Overcast', 'Partially cloudy', 'Rain, Partially cloudy',
# 'Rain, Overcast', 'Rain') are left out.
cordata <- data_summary %>%
  select(temp, humidity, windspeed, cloudcover, sealevelpressure, precip)

library(GGally)
# Scatterplot matrix of all feature pairs.
ggpairs(cordata)

# Pairwise correlations, computed only from rows where every feature is
# present.
cor_matrix <- cor(cordata, use = "complete.obs")

# Base-R heatmap without dendrograms or reordering. The matrix already holds
# correlations on a common scale, so it is drawn unscaled: column scaling
# would re-centre each column and the colours would no longer represent the
# correlation values.
heatmap(cor_matrix, Colv = NA, Rowv = NA, symm = TRUE, scale = "none")

library(corrplot)
# Lower-triangle correlation plot with coloured cells, variables ordered by
# hierarchical clustering and the coefficient printed in each cell.
corrplot(
  cor_matrix,
  method = "color", type = "lower", order = "hclust",
  tl.col = "black", addCoef.col = "black", number.cex = 0.6,
  cl.ratio = 0.2, tl.srt = 45, col = COL2("RdBu", 10)
)

# Pairwise correlation tests at the 95% confidence level; the resulting
# p-value matrix decides which cells are shown in the next plot.
testRes <- cor.mtest(cordata, conf.level = 0.95)

# Same kind of plot in "AOE" order without the diagonal: coefficients are
# printed, and cells whose correlation is not significant are left blank.
corrplot(
  cor_matrix,
  p.mat = testRes$p,
  method = "color", type = "lower", insig = "blank",
  tl.col = "black", addCoef.col = "black", number.cex = 0.6,
  order = "AOE", diag = FALSE
)